/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is ExtensibleSinglePassIndexer.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
* Roi Blanco (original author)
* Jonathon Hare <jsh2{a.}ecs.soton.ac.uk>
* Craig Macdonald <craigm{a.}dcs.gla.ac.uk>
*
*/
package org.terrier.indexing;
import java.io.IOException;
import java.util.LinkedList;
import org.terrier.structures.FieldDocumentIndexEntry;
import org.terrier.structures.Index;
import org.terrier.structures.SimpleDocumentIndexEntry;
import org.terrier.structures.indexing.DocumentIndexBuilder;
import org.terrier.structures.indexing.singlepass.FileRunIteratorFactory;
import org.terrier.structures.indexing.singlepass.PostingInRun;
import org.terrier.structures.indexing.singlepass.RunsMerger;
import org.terrier.terms.TermPipeline;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.FieldScore;
/**
* Directly based on BasicSinglePassIndexer, with just a few modifications
* that expose some extra hooks for subclasses.
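*
* <p>
* A minimal sketch of a concrete subclass. This is hypothetical: it
* mirrors the defaults of BasicSinglePassIndexer and assumes the
* Terrier 3.x single-pass API (the inherited {@code mp} and
* {@code termsInDocument} fields, {@code MemoryPostings},
* {@code DocumentPostingList}, {@code SimplePostingInRun} and the
* {@code BasicTermProcessor} pipeline terminator from {@code BasicIndexer}):
* <pre>{@code
* public class MySinglePassIndexer extends ExtensibleSinglePassIndexer {
*     public MySinglePassIndexer(String pathname, String prefix) {
*         super(pathname, prefix);
*     }
*
*     protected TermPipeline getEndOfPipeline() {
*         //terminate the pipeline as BasicIndexer does when fields are disabled
*         return new BasicTermProcessor();
*     }
*
*     protected Class<? extends PostingInRun> getPostingInRunClass() {
*         //plain postings: no fields, no blocks
*         return SimplePostingInRun.class;
*     }
*
*     protected void createMemoryPostings() {
*         mp = new MemoryPostings();
*     }
*
*     protected void createDocumentPostings() {
*         termsInDocument = new DocumentPostingList();
*     }
*
*     protected void preProcess(Document doc, String term) {
*         //hook invoked for each term before it enters the pipeline
*     }
* }
* }</pre>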
*
* @author Roi Blanco
* @author Jonathon Hare <jsh2{a.}ecs.soton.ac.uk>
*/
public abstract class ExtensibleSinglePassIndexer extends BasicSinglePassIndexer {
/** {@inheritDoc} */
@Override
protected abstract TermPipeline getEndOfPipeline();
/**
* Construct the indexer.
* @param pathname String the path where the datastructures will
* be created. This is assumed to be absolute.
* @param prefix String the prefix of the index, usually "data".
*/
public ExtensibleSinglePassIndexer(String pathname, String prefix) {
super(pathname, prefix);
}
/**
* Get the class for storing postings in runs.
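* For an indexer without fields or blocks, this would typically be
* {@code SimplePostingInRun} (see the class-level usage sketch above).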
* @return PostingInRun Subclass of PostingInRun for this indexer
*/
protected abstract Class<? extends PostingInRun> getPostingInRunClass();
/** {@inheritDoc} */
@Override
protected void createRunMerger(String[][] files) throws Exception{
//modified to use getPostingInRunClass(); the final argument (0) is the field count
merger = new RunsMerger(new FileRunIteratorFactory(files, getPostingInRunClass(), 0));
}
/** {@inheritDoc} */
@Override
protected abstract void createMemoryPostings();
/** {@inheritDoc} */
@Override
protected abstract void createDocumentPostings();
/**
* Builds the inverted file and lexicon file for the given collections.
* Loops through each document in each of the collections,
* extracting terms and pushing them through the term pipeline
* (e.g. stemming, stopping, lowercasing).
*
* Only one thing is modified from BasicSinglePassIndexer:
* a pre-processing operation, {@link #preProcess(Document, String)},
* is applied to each term before it is passed to the pipeline.
*
* @param collections Collection[] the collections to be indexed.
*/
@Override
public void createInvertedIndex(Collection[] collections) {
// //logger.info("Creating IF (no direct file)..");
long startCollection, endCollection;
fileNames = new LinkedList<String[]>();
numberOfDocuments = currentId = numberOfDocsSinceCheck = numberOfDocsSinceFlush = numberOfUniqueTerms = 0;
numberOfTokens = numberOfPointers = 0;
createMemoryPostings();
currentIndex = Index.createNewIndex(path, prefix);
docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document");
metaBuilder = createMetaIndexBuilder();
emptyDocIndexEntry = (FieldScore.FIELDS_COUNT > 0) ? new FieldDocumentIndexEntry(FieldScore.FIELDS_COUNT) : new SimpleDocumentIndexEntry();
MAX_DOCS_PER_BUILDER = Integer.parseInt(ApplicationSetup.getProperty("indexing.max.docs.per.builder", "0"));
maxMemory = Long.parseLong(ApplicationSetup.getProperty("indexing.singlepass.max.postings.memory", "0"));
final boolean boundaryDocsEnabled = BUILDER_BOUNDARY_DOCUMENTS.size() > 0;
final int collections_length = collections.length;
boolean stopIndexing = false;
System.gc();
memoryAfterFlush = runtime.freeMemory();
// logger.debug("Starting free memory: "+memoryAfterFlush/1000000+"M");
for(int collectionNo = 0; ! stopIndexing && collectionNo < collections_length; collectionNo++)
{
Collection collection = collections[collectionNo];
startCollection = System.currentTimeMillis();
while(collection.nextDocument())
{
/* get the next document from the collection */
Document doc = collection.getDocument();
if (doc == null)
continue;
//numberOfDocuments++;
/* setup for parsing */
createDocumentPostings();
String term; //term we're currently processing
numOfTokensInDocument = 0;
//get each term in the document
while (!doc.endOfDocument()) {
if ((term = doc.getNextTerm())!=null && !term.equals("")) {
termFields = doc.getFields();
//perform pre-op
preProcess(doc, term); //JH MOD
/* pass term into TermPipeline (stop, stem etc) */
pipeline_first.processTerm(term);
/* the term pipeline will eventually add the term to this object. */
}
if (MAX_TOKENS_IN_DOCUMENT > 0 &&
numOfTokensInDocument > MAX_TOKENS_IN_DOCUMENT)
break;
}
//if we didn't index all tokens from document,
//we need to get to the end of the document.
while (!doc.endOfDocument())
doc.getNextTerm();
pipeline_first.reset();
/* we now have all terms in the DocumentTree, so we save the document tree */
try
{
if (termsInDocument.getDocumentLength() == 0)
{ /* this document is empty, add the minimum to the document index */
indexEmpty(doc.getAllProperties());
if (IndexEmptyDocuments)
{
currentId++;
numberOfDocuments++;
}
}
else
{ /* index this document */
numberOfTokens += numOfTokensInDocument;
indexDocument(doc.getAllProperties(), termsInDocument);
}
}
catch (Exception ioe)
{
// logger.error("Failed to index "+doc.getProperty("docno"),ioe);
}
if (MAX_DOCS_PER_BUILDER>0 && numberOfDocuments >= MAX_DOCS_PER_BUILDER)
{
stopIndexing = true;
break;
}
if (boundaryDocsEnabled && BUILDER_BOUNDARY_DOCUMENTS.contains(doc.getProperty("docno")))
{
//logger.warn("Document "+doc.getProperty("docno")+" is a builder boundary document. Boundary forced.");
stopIndexing = true;
break;
}
termsInDocument.clear();
}
try{
forceFlush();
endCollection = System.currentTimeMillis();
long partialTime = (endCollection-startCollection)/1000;
// //logger.info("Collection #"+collectionNo+ " took "+partialTime+ " seconds to build the runs for "+numberOfDocuments+" documents\n");
docIndexBuilder.finishedCollections();
if (FieldScore.FIELDS_COUNT > 0)
{
currentIndex.addIndexStructure("document-factory", FieldDocumentIndexEntry.Factory.class.getName(), "java.lang.String", "${index.inverted.fields.count}");
}
else
{
currentIndex.addIndexStructure("document-factory", SimpleDocumentIndexEntry.Factory.class.getName(), "", "");
}
metaBuilder.close();
currentIndex.flush();
// //logger.info("Merging "+fileNames.size()+" runs...");
startCollection = System.currentTimeMillis();
performMultiWayMerge();
currentIndex.flush();
endCollection = System.currentTimeMillis();
// //logger.info("Collection #"+collectionNo+" took "+((endCollection-startCollection)/1000)+" seconds to merge\n ");
// //logger.info("Collection #"+collectionNo+" total time "+( (endCollection-startCollection)/1000+partialTime));
long secs = ((endCollection-startCollection)/1000);
// if (secs > 3600)
// //logger.info("Rate: "+((double)numberOfDocuments/((double)secs/3600.0d))+" docs/hour");
} catch (Exception e) {
// logger.error("Problem finishing index", e);
}
}
finishedInvertedIndexBuild();
}
/**
* Perform an operation on a term before it is passed into the term pipeline.
*
* This could, for example, extract data and store it in a field
* that the pipeline can access.
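*
* A hypothetical override might collect the terms occurring in a
* TITLE field (via {@link Document#getFields()}):
* <pre>{@code
* protected void preProcess(Document doc, String term) {
*     if (doc.getFields().contains("TITLE"))
*         titleTerms.add(term); //titleTerms: a Set held by the subclass
* }
* }</pre>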
*
* @param doc Current document
* @param term Current term
*/
protected abstract void preProcess(Document doc, String term);
/**
* Get the index currently being constructed by this indexer.
* This may be null if indexing has not yet commenced. It is
* useful for adding extra properties, etc. to the index after
* indexing has finished.
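*
* For example, after indexing one might record a custom property
* (the property name here is illustrative):
* <pre>{@code
* indexer.getCurrentIndex().setIndexProperty(
*     "myapp.indexing.completed", String.valueOf(System.currentTimeMillis()));
* }</pre>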
*
* @return the current index
*/
public Index getCurrentIndex() {
return currentIndex;
}
/**
* Delegate that allows a HadoopIndexerMapper to intercept flushes.
*/
protected SinglePassIndexerFlushDelegate flushDelegate;
/**
* Set the flushDelegate.
* @param _flushDelegate the delegate that should intercept flushes
*/
protected void setFlushDelegate(SinglePassIndexerFlushDelegate _flushDelegate) {
this.flushDelegate = _flushDelegate;
}
/**
* Get the flushDelegate.
* @return the flushDelegate
*/
protected SinglePassIndexerFlushDelegate getFlushDelegate() {
return flushDelegate;
}
/**
* Force the indexer to flush everything and free memory.
* Either calls the super method, or passes to a delegate if
* the flushDelegate is set.
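*
* <p>
* For example, a subclass (or same-package caller) might install a
* delegate that counts flushes instead of performing them. This is a
* hypothetical sketch; it assumes {@code SinglePassIndexerFlushDelegate}
* declares a matching {@code forceFlush()} method:
* <pre>{@code
* setFlushDelegate(new SinglePassIndexerFlushDelegate() {
*     public void forceFlush() throws IOException {
*         flushCount++; //flushCount: a counter held by the enclosing class
*     }
* });
* }</pre>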
* @see org.terrier.indexing.BasicSinglePassIndexer#forceFlush()
*/
@Override
protected void forceFlush() throws IOException {
if (flushDelegate == null) super.forceFlush();
else flushDelegate.forceFlush();
}
}